5. In Chapter 4, we used logistic regression to predict the probability of default using income and balance on the Default data set. We will now estimate the test error of this logistic regression model using the validation set approach. Do not forget to set a random seed before beginning your analysis.

#settings
library(ggplot2)
library(gridExtra)
library(MASS)
#load data
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
## 
##     Boston
data("Default")
default_table = table(Default$default)
default_table
## 
##   No  Yes 
## 9667  333

ans) Labels in the data are biased.

barplot(default_table, 
        main = "default histogram",  # 히스토그램 제목
        xlab = "Default Status",  # x축 레이블
        ylab = "Frequency",  # y축 레이블
        col = c("skyblue", "orange"),  # 막대 색상 지정
        border = "black"  # 막대 테두리 색상 지정
)

ans) The samples are biased to one side.

scatter plot And box plot : default / balance / income

library(ggplot2)

# Scatter plot : balance ~ income
ggplot(Default, aes(x = balance, y = income, color = default)) +
  geom_point(alpha = 0.6) +
  labs(x = "balance", y = "income", color = "Default Status")

# box plot : default ~ balance
ggplot(Default, aes(x = default, y = balance)) +
  geom_boxplot() + 
  labs(x = "default", y = "balance")

# box plot : default ~ income
ggplot(Default, aes(x = default, y = income)) +
  geom_boxplot() + 
  labs(x = "default", y = "income")

#### additional(log scale).

Default$log_balance = log(Default$balance)
Default$log_income = log(Default$income)

# Scatter plot : log_balance ~ log_income
ggplot(Default, aes(x = log_balance, y = log_income, color = default)) +
  geom_point(alpha = 0.6) +
  labs(x = "log_balance", y = "log_income", color = "Default Status")

# box plot : default ~ log_balance
ggplot(Default, aes(x = default, y = log_balance)) +
  geom_boxplot() + 
  labs(x = "default", y = "log_balance")
## Warning: Removed 499 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# box plot : default ~ log_income
ggplot(Default, aes(x = default, y = log_income)) +
  geom_boxplot() + 
  labs(x = "default", y = "log_income")

ans)
Taking a log on the data doesn’t seem appropriate.

addiational(LDA) : supervised

# LDA 수행
lda_model <- lda(default ~ balance + income, data = Default)

# Scatter plot 그리기
ggplot(Default, aes(x = balance, y = income, color = default)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = -lda_model$scaling[2] / lda_model$scaling[1], intercept = lda_model$scaling[3] / lda_model$scaling[1], linetype = "dashed", color = "blue") +
  labs(x = "Balance", y = "Income", color = "Default") +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_abline()`).

addional(PCA) : unsupervised

pca_model <- prcomp(Default[, c("balance", "income")], scale = TRUE)

pca_data <- as.data.frame(pca_model$x)

ggplot(pca_data, aes(x = PC1, y = PC2, color = Default$default)) +
  geom_point(alpha = 0.6) +
  labs(x = "Principal Component 1 (PC1)", y = "Principal Component 2 (PC2)", color = "Default") +
  theme_minimal()

ans)
Performing log, LDA, and PCA doesn’t seem significant.

(5) Fit a logistic regression model that uses income and balance to predict default.

set.seed(222)
model_1 = glm(default ~ income + balance, data = Default, family = "binomial")

new_data = data.frame(balance = seq(min(Default$balance), max(Default$balance), length.out = 100), income = seq(min(Default$income), max(Default$income), length.out = 100))

new_data$predicted_prob = predict(model_1, newdata = new_data, type = "response")


#binary mapping
Default$default_binary = ifelse(Default$default == "No", 0, 1)

ggplot(new_data, aes(x = balance, y = predicted_prob)) +
  geom_point(data = Default, aes(x = balance, y = default_binary, color = default), alpha = 0.5) + 
  geom_line(color = "blue") +
  labs(x = "Credit Card Balance", y = "Probability of Default", title = "Logistic Regression Model(default ~ income + balance)") +
  theme_minimal()

summary(model_1)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

ans) Although both the income and the balance are statistically significant, it can be seen that the balance has a larger z-value than the income. and coefficient of balance is much bigger than income. Therefore, the balance is more significant in estimating default.

additional(5 - 1).Fit a logistic regression model only using balance as predictor.
set.seed(222)
model_2 = glm(default ~ balance, data = Default, family = "binomial")

new_data = data.frame(balance = seq(min(Default$balance), max(Default$balance), length.out = 100))

new_data$predicted_prob = predict(model_2, newdata = new_data, type = "response")

#binary mapping
Default$default_binary = ifelse(Default$default == "No", 0, 1)

#draw
ggplot(new_data, aes(x = balance, y = predicted_prob)) +
  geom_point(data = Default, aes(x = balance, y = default_binary, color = default), alpha = 0.5) + 
  geom_line(color = "blue") +
  labs(x = "Credit Card Balance", y = "Probability of Default", title = "Logistic Regression Model(default ~ income + balance)") +
  theme_minimal()

summary(model_2)
## 
## Call:
## glm(formula = default ~ balance, family = "binomial", data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

(b) Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:

i. Split the sample set into a training set and a validation set.

I divided the data into two part, training set and validation set.
(training set : validation set = 7 : 3)

set.seed(222)
train_idx = sample(1:nrow(Default), 0.7 * nrow(Default))

train_data = Default[train_idx, ]
validation_data = Default[-train_idx, ]
ii. Fit a multiple logistic regression model using only the training observations.
model_3 = glm(default ~ balance + income, data = train_data, family = "binomial")
iii. Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.
valid_pred = predict(model_3, newdata= validation_data, type = "response")
valid_pred_class = ifelse(valid_pred > 0.5, 1, 0)

ans) If the softmax value is higher than 0.5, it is classified as “Up”.

iv. Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
actual_class = validation_data$default_binary
confusion_matrix = table(actual = actual_class, predicted = valid_pred_class)
confusion_matrix
##       predicted
## actual    0    1
##      0 2885   12
##      1   67   36
TN = 2885
FP = 12
FN = 67
TP = 36

BCE = -mean(actual_class  * log(valid_pred) + (1 - actual_class) * log(1 - valid_pred))
accuracy = (TP + TN)/(TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
specificity = (TN) / (TN + FP)

validation_set_error = (FP + FN) / (TP + FP + TN + FN)

f1 = 2 * (precision * recall)/(precision + recall)
matrics = data.frame(
  Matric = c("BCE", "accuracy","precision", "specificity", "recall", "f1-score", "validation_set_error"),
  Value = c(BCE, accuracy, precision, specificity, recall, f1, validation_set_error)
)

print(matrics)
##                 Matric      Value
## 1                  BCE 0.08319640
## 2             accuracy 0.97366667
## 3            precision 0.75000000
## 4          specificity 0.99585778
## 5               recall 0.34951456
## 6             f1-score 0.47682119
## 7 validation_set_error 0.02633333

ans) In my opinion, the accuracy value was high because there are lots of “No” labeled data than the “Yes” label. additionally, recall value was lower than others.

(c) Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.

i. Split the sample set into a training set and a validation set.
#First
set.seed(223)
train_idx_first = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data_first = Default[train_idx_first, ]
validation_data_first = Default[-train_idx_first, ]

#Second
set.seed(224)
train_idx_second = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data_second = Default[train_idx_second, ]
validation_data_second = Default[-train_idx_second, ]

#Third
set.seed(225)
train_idx_third = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data_third = Default[train_idx_third, ]
validation_data_third = Default[-train_idx_third, ]
ii. Fit a multiple logistic regression model using only the training observations
#First
model_first = glm(default ~ balance + income, data = train_data_first, family = "binomial")
#Second
model_second = glm(default ~ balance + income, data = train_data_second, family = "binomial")
#Third
model_thrid = glm(default ~ balance + income, data = train_data_third, family = "binomial")
iii. Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greater than 0.5.
#First
valid_pred_first = predict(model_first, newdata= validation_data, type = "response")
valid_pred_class_frist = ifelse(valid_pred_first > 0.5, 1, 0)

#Second
valid_pred_second = predict(model_second, newdata= validation_data, type = "response")
valid_pred_class_second = ifelse(valid_pred_second > 0.5, 1, 0)

#Third
valid_pred_third = predict(model_thrid, newdata= validation_data, type = "response")
valid_pred_class_third = ifelse(valid_pred_third > 0.5, 1, 0)
iv. Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified.
#First
actual_class_first = validation_data_first$default_binary
confusion_matrix_first = table(actual = actual_class_first, predicted = valid_pred_class_frist)

#Second
actual_class_second = validation_data_second$default_binary
confusion_matrix_second = table(actual = actual_class_second, predicted = valid_pred_class_second)

#Third
actual_class_third = validation_data_third$default_binary
confusion_matrix_third = table(actual = actual_class_third, predicted = valid_pred_class_third)
#First
confusion_matrix_first
##       predicted
## actual    0    1
##      0 2849   46
##      1  105    0

ans) It was confirmed that the case where Default is “yes” was not classified well.

TN_first = 2849
FP_first = 46
FN_first = 105
TP_first = 0

BCE_first = -mean(actual_class_first  * log(valid_pred_first) + (1 - actual_class_first) * log(1 - valid_pred_first))
accuracy_first = (TP_first + TN_first)/(TP_first + FP_first + TN_first + FN_first)
precision_first = TP_first / (TP_first + FP_first)
recall_first = TP / (TP_first + FN_first)
f1_first = 2 * (precision_first * recall_first)/(precision_first + recall_first)
specificity_first = (TN_first) / (TN_first + FP_first)

validation_set_error_first = (FP_first + FN_first) / (TP_first + FP_first + TN_first + FN_first)
confusion_matrix_second
##       predicted
## actual    0    1
##      0 2851   46
##      1  102    1

ans) It was confirmed that the case where Default is “yes” was not classified well.

TN_second = 2851
FP_second = 46
FN_second = 102
TP_second = 1

BCE_second = -mean(actual_class_second  * log(valid_pred_second) + (1 - actual_class_second) * log(1 - valid_pred_second))
accuracy_second = (TP_second + TN_second)/(TP_second + FP_second + TN_second + FN_second)
precision_second = TP_second / (TP_second + FP_second)
recall_second = TP_second / (TP_second + FN_second)
f1_second = 2 * (precision_second * recall_second)/(precision_second + recall_second)
specificity_second = (TN_second) / (TN_second + FP_second)
validation_set_error_second = (FP_second + FN_second) / (TP_second + FP_second + TN_second + FN_second)
confusion_matrix_third
##       predicted
## actual    0    1
##      0 2856   47
##      1   97    0

ans) It was confirmed that the case where Default is “yes” was not classified well.

TN_third = 2856
FP_third = 47
FN_third = 97
TP_third = 0

BCE_third = -mean(actual_class_third  * log(valid_pred_third) + (1 - actual_class_third) * log(1 - valid_pred_third))
accuracy_third = (TP_third + TN_third)/(TP_third + FP_third + TN_third + FN_third)
precision_third = TP_third / (TP_third + FP_third)
recall_third = TP_third / (TP_third + FN_third)
f1_third = 2 * (precision_third * recall_third)/(precision_third + recall_third)
specificity_third = (TN_third) / (TN_third + FP_third)
validation_set_error_third = (FP_third + FN_third) / (TP_third + FP_third + TN_third + FN_third)
matrics_three_times = data.frame(
  Matric = c("BCE", "accuracy", "precision", "specificity","recall", "f1-score", validation_set_error),
  first = c(BCE_first, accuracy_first, precision_first, specificity_first, recall_first, f1_first, validation_set_error_first),
  second = c(BCE_second, accuracy_second, precision_second, specificity_second,recall_second, f1_second, validation_set_error_second),
  third = c(BCE_third, accuracy_third, precision_third, specificity_third,recall_third, f1_third, validation_set_error_third)
)

print(matrics_three_times)
##               Matric      first      second     third
## 1                BCE 0.25866531 0.245639483 0.2455566
## 2           accuracy 0.94966667 0.950666667 0.9520000
## 3          precision 0.00000000 0.021276596 0.0000000
## 4        specificity 0.98411054 0.984121505 0.9838099
## 5             recall 0.34285714 0.009708738 0.0000000
## 6           f1-score 0.00000000 0.013333333       NaN
## 7 0.0263333333333333 0.05033333 0.049333333 0.0480000

ans)For all cases, all Matrics have low recall values. because TP value had 0 about most of cases, other Matrics values(precision, f1-score…) was not clear.

7.In Sections 5.3.2 and 5.3.3, we saw that the cv.glm() function can be used in order to compute the LOOCV test error estimate. Alternatively, one could compute those quantities using just the glm() and predict. glm() functions, and a for loop. You will now take this approach in order to compute the LOOCV error for a simple logistic regression model on the Weekly data set. Recall that in the context of classification problems, the LOOCV error is given in (5.4).

library(boot)
data("Weekly")

(a) Fit a logistic regression model that predicts Direction using Lag1 and Lag2.

glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22122    0.06147   3.599 0.000319 ***
## Lag1        -0.03872    0.02622  -1.477 0.139672    
## Lag2         0.06025    0.02655   2.270 0.023232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1488.2  on 1086  degrees of freedom
## AIC: 1494.2
## 
## Number of Fisher Scoring iterations: 4

ans)Only Lag2 seems statistically significant.

(b) Fit a logistic regression model that predicts Direction using Lag1 and Lag2 using all but the first observation.

weekly_subset = Weekly[-1, ]
model_weekly = glm(Direction ~ Lag1 + Lag2, data = weekly_subset, family="binomial")

summary(model_weekly)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = weekly_subset)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22324    0.06150   3.630 0.000283 ***
## Lag1        -0.03843    0.02622  -1.466 0.142683    
## Lag2         0.06085    0.02656   2.291 0.021971 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1494.6  on 1087  degrees of freedom
## Residual deviance: 1486.5  on 1085  degrees of freedom
## AIC: 1492.5
## 
## Number of Fisher Scoring iterations: 4

ans) It has a coefficient similar to that before. Lag2(0.06025 -> 0.06085)

(c) Use the model from (b) to predict the direction of the first observation. You can do this by predicting that the first observation will go up if P (Direction = “Up”|Lag1, Lag2) > 0.5. Was this observation correctly classified?

valid_pred_weekly = predict(model_weekly, newdata= Weekly[1, ], type = "response")
valid_pred_class_weekly = ifelse(valid_pred_weekly > 0.5, "Up", "Down")
real = as.character((Weekly[1, ]$Direction))
est = as.character(valid_pred_class_weekly)
print("*****<real>*******")
## [1] "*****<real>*******"
print(real)
## [1] "Down"
print("*****<est>*******")
## [1] "*****<est>*******"
print(est)
## [1] "Up"

(d) Write a for loop from i = 1 to i = n,where n is the number of observations in the data set, that performs each of the following steps:

i. Fit a logistic regression model using all but the ith observation to predict Direction using Lag1 and Lag2.
set.seed(20)
errors = numeric(nrow(Weekly))
n = nrow(Weekly)
for (i in 1:n) {
  model = glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family="binomial")
  prediction = predict(model, newdata = Weekly[i, ], type="response")
  
actual = ifelse(Weekly[i, "Direction"] == "Up", 1, 0)
#binary cross-entropy
errors[i] = -actual * log(prediction) - (1 - actual) * log(1 - prediction)
}

errors_mean = mean(errors)
plot(errors, type = "l", xlab = "ith", ylab = "Binary Cross Entropy", main = "LOOCV Errors")

print(tail(errors))
## [1] 0.7404976 0.5736674 0.6686027 0.5349879 0.5605849 0.5998062
print(errors_mean)
## [1] 0.6860653
ii. Compute the posterior probability of the market moving up for the ith observation.
prob = rep(0, nrow(Weekly))
for (i in 1:nrow(Weekly)) {
  prob[i] = predict(model, newdata=Weekly[i, ], type="response")
}
print(mean(prob))
## [1] 0.5551413

ans) The probability prediction seems to be close to Up.

plot(prob, type = "l", xlab = "ith", ylab = "probability", main = "ith observation")

##### iii. Use the posterior probability for the ith observation in order to predict whether or not the market moves up.

Weekly$binary_direction = ifelse(Weekly$Direction == "Up", 1, 0)
prob_class = ifelse(prob > 0.5, 1, 0)
print(length(Weekly$binary_direction))
## [1] 1089
print(length(prob_class))
## [1] 1089
iv. Determine whether or not an error was made in predicting the direction for the ith observation. If an error was made, then indicate this as a 1, and otherwise indicate it as a 0.
confusion_matrix_loocv = table(Weekly$binary_direction, prob_class)
print(confusion_matrix_loocv)
##    prob_class
##       0   1
##   0  39 445
##   1  39 566

(e) Take the average of the n numbers obtained in (d)iv in order to obtain the LOOCV estimate for the test error. Comment on the results.

TN = 39
FP = 445
FN = 39
TP = 566

BCE = -mean(Weekly$binary_direction  * log(prob) + (1 - Weekly$binary_direction) * log(1 - prob))
accuracy = (TP + TN)/(TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
specificity = (TN) / (TN + FP)

validation_set_error = (FP + FN) / (TP + FP + TN + FN)

f1 = 2 * (precision * recall)/(precision + recall)
matrics_loocv = data.frame(
  Matric = c("BCE", "accuracy", "precision", "specificity","recall", "f1-score", "validation_set_error"),
  Value = c(BCE, accuracy, precision, specificity, recall, f1, validation_set_error)
)

print(matrics_loocv)
##                 Matric      Value
## 1                  BCE 0.68329716
## 2             accuracy 0.55555556
## 3            precision 0.55984174
## 4          specificity 0.08057851
## 5               recall 0.93553719
## 6             f1-score 0.70049505
## 7 validation_set_error 0.44444444

ans) It shows a higher recall value than the default data set. This seems to be because the snp500 index has been steadily rising from 1990 to 2010. Other indicators are difficult to see as significant.